InΒ [2]:
import pandas as pd
import matplotlib.pyplot as plt
InΒ [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv("sales_data_with_issues.csv")
df.head(10)
Out[3]:
| Date | Region | Category | Sales | |
|---|---|---|---|---|
| 0 | 2023-06-21 | North | Furniture | 575.07 |
| 1 | 2023-07-17 | North | Furniture | 1537.26 |
| 2 | 2023-07-28 | East | Technology | 1543.61 |
| 3 | 2023-10-04 | South | Technology | 1441.21 |
| 4 | 2023-06-07 | East | Office Supplies | 4923.52 |
| 5 | 2023-11-07 | South | Technology | 567.06 |
| 6 | 2023-09-27 | West | Technology | NaN |
| 7 | 2023-09-18 | South | Furniture | 3339.69 |
| 8 | 2023-09-30 | North | Technology | 1274.06 |
| 9 | 2023-05-04 | West | Technology | 3913.17 |
InΒ [4]:
# Loading the data
salesData = pd.read_csv('sales_data_with_issues.csv')
salesData
Out[4]:
| Date | Region | Category | Sales | |
|---|---|---|---|---|
| 0 | 2023-06-21 | North | Furniture | 575.07 |
| 1 | 2023-07-17 | North | Furniture | 1537.26 |
| 2 | 2023-07-28 | East | Technology | 1543.61 |
| 3 | 2023-10-04 | South | Technology | 1441.21 |
| 4 | 2023-06-07 | East | Office Supplies | 4923.52 |
| ... | ... | ... | ... | ... |
| 200 | 2023-03-25 | West | Office Supplies | 1792.31 |
| 201 | 2023-07-14 | East | Technology | 4411.82 |
| 202 | 2023-04-02 | South | Technology | 635.79 |
| 203 | 2023-07-20 | North | Furniture | 673.73 |
| 204 | 2023-07-19 | South | Technology | 1336.89 |
205 rows Γ 4 columns
InΒ [5]:
#no.of rows n columns
df.shape
Out[5]:
(205, 4)
InΒ [6]:
#column names
df.columns
Out[6]:
Index(['Date', 'Region', 'Category', 'Sales'], dtype='object')
InΒ [7]:
#Data quality check
df.isnull().sum()
Out[7]:
Date 0 Region 3 Category 0 Sales 5 dtype: int64
InΒ [8]:
df.dtypes
Out[8]:
Date object Region object Category object Sales float64 dtype: object
InΒ [9]:
df.nunique
Out[9]:
<bound method DataFrame.nunique of Date Region Category Sales 0 2023-06-21 North Furniture 575.07 1 2023-07-17 North Furniture 1537.26 2 2023-07-28 East Technology 1543.61 3 2023-10-04 South Technology 1441.21 4 2023-06-07 East Office Supplies 4923.52 .. ... ... ... ... 200 2023-03-25 West Office Supplies 1792.31 201 2023-07-14 East Technology 4411.82 202 2023-04-02 South Technology 635.79 203 2023-07-20 North Furniture 673.73 204 2023-07-19 South Technology 1336.89 [205 rows x 4 columns]>
InΒ [10]:
# Data Insights and Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
sales_df = pd.read_csv('sales_data_with_issues.csv')
# 1. Missing values summary
print('Missing values by column:')
print(sales_df.isnull().sum())
# 2. Top performing categories
print('\nAverage sales by category:')
print(sales_df.groupby('Category')['Sales'].mean().sort_values(ascending=False))
sns.barplot(x='Category', y='Sales', data=sales_df)
plt.title('Average Sales by Category')
plt.show()
# 3. Regional sales distribution
print('\nTotal sales by region:')
print(sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False))
sns.barplot(x='Region', y='Sales', data=sales_df)
plt.title('Total Sales by Region')
plt.show()
# 4. Sales trends over time
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
monthly_sales = sales_df.groupby(sales_df['Date'].dt.to_period('M'))['Sales'].sum()
print('\nMonthly sales trend:')
print(monthly_sales)
monthly_sales.plot(kind='line', marker='o')
plt.title('Monthly Sales Trend')
plt.ylabel('Sales')
plt.xlabel('Month')
plt.show()
# 5. Outlier detection (IQR method)
Q1 = sales_df['Sales'].quantile(0.25)
Q3 = sales_df['Sales'].quantile(0.75)
IQR = Q3 - Q1
outliers = sales_df[(sales_df['Sales'] < Q1 - 1.5 * IQR) | (sales_df['Sales'] > Q3 + 1.5 * IQR)]
print(f'\nNumber of sales outliers: {len(outliers)}')
print(outliers[['Date','Region','Category','Sales']])
Missing values by column: Date 0 Region 3 Category 0 Sales 5 dtype: int64 Average sales by category: Category Technology 2615.564925 Office Supplies 2601.263906 Furniture 2464.836377 Name: Sales, dtype: float64
Total sales by region: Region East 133186.70 North 128554.09 West 121530.31 South 117033.54 Name: Sales, dtype: float64
Monthly sales trend: Date 2023-01 36266.07 2023-02 37527.90 2023-03 54646.70 2023-04 65641.71 2023-05 30821.87 2023-06 33070.25 2023-07 61041.08 2023-08 39837.48 2023-09 57629.26 2023-10 39576.65 2023-11 24268.67 2023-12 31469.81 Freq: M, Name: Sales, dtype: float64
Number of sales outliers: 0 Empty DataFrame Columns: [Date, Region, Category, Sales] Index: []
InΒ [Β ]:
InΒ [11]:
# Top Performing KPIs
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
# Total Sales
total_sales = sales_df['Sales'].sum()
print(f'Total Sales: {total_sales:.2f}')
# Average Sales per Transaction
average_sales = sales_df['Sales'].mean()
print(f'Average Sales per Transaction: {average_sales:.2f}')
# Top Region by Total Sales
top_region = sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False).index[0]
print(f'Top Region by Total Sales: {top_region}')
# Top Category by Total Sales
top_category = sales_df.groupby('Category')['Sales'].sum().sort_values(ascending=False).index[0]
print(f'Top Category by Total Sales: {top_category}')
# Sales Growth (First vs Last Month)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
monthly_sales = sales_df.groupby(sales_df['Date'].dt.to_period('M'))['Sales'].sum()
if len(monthly_sales) > 1:
growth = ((monthly_sales.iloc[-1] - monthly_sales.iloc[0]) / monthly_sales.iloc[0]) * 100
print(f'Sales Growth (First vs Last Month): {growth:.2f}%')
else:
print('Not enough data for sales growth calculation.')
Total Sales: 511797.45 Average Sales per Transaction: 2558.99 Top Region by Total Sales: East Top Category by Total Sales: Technology Sales Growth (First vs Last Month): -13.23%
InΒ [12]:
# Average Sales per Region per Month
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
avg_sales = sales_df.groupby(['Region', 'Month'])['Sales'].mean().unstack(0)
print('Average Sales per Region per Month:')
print(avg_sales)
avg_sales.plot(kind='line', marker='o', figsize=(12,6))
plt.title('Average Sales per Region per Month')
plt.ylabel('Average Sales')
plt.xlabel('Month')
plt.legend(title='Region')
plt.show()
Average Sales per Region per Month: Region East North South West Month 2023-01 2080.532500 1717.730000 3175.030000 2522.657500 2023-02 2202.240000 3293.876667 2035.610000 1969.000000 2023-03 2645.712000 2445.000000 2037.790000 1715.060000 2023-04 2008.138000 3059.812857 2037.810000 2665.920000 2023-05 3828.380000 2266.055000 2632.370000 4221.510000 2023-06 3400.967500 839.625000 1993.942500 3270.453333 2023-07 3575.173333 1765.622000 1924.248333 3462.776667 2023-08 2997.765000 2817.066000 4034.635000 2845.910000 2023-09 3545.444000 1528.015000 3059.254286 3093.800000 2023-10 3210.220000 2294.358000 3175.570000 2868.635000 2023-11 1803.230000 2292.683333 567.060000 1628.456000 2023-12 3486.290000 1160.610000 2354.477500 2315.376667
InΒ [13]:
# Line Chart of Sales Over Time
import pandas as pd
import matplotlib.pyplot as plt
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
daily_sales = sales_df.groupby('Date')['Sales'].sum()
plt.figure(figsize=(12,6))
daily_sales.plot(kind='line', marker='o')
plt.title('Total Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
InΒ [14]:
# Pie Chart of Sales by Region
import pandas as pd
import matplotlib.pyplot as plt
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_by_region = sales_df.groupby('Region')['Sales'].sum()
plt.figure(figsize=(8,8))
sales_by_region.plot(kind='pie', autopct='%1.1f%%', startangle=90)
plt.title('Sales Distribution by Region')
plt.ylabel('')
plt.show()
InΒ [15]:
# Identify Outliers in Sales Amount
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
Q1 = sales_df['Sales'].quantile(0.25)
Q3 = sales_df['Sales'].quantile(0.75)
IQR = Q3 - Q1
outliers = sales_df[(sales_df['Sales'] < Q1 - 1.5 * IQR) | (sales_df['Sales'] > Q3 + 1.5 * IQR)]
print(f'Number of outliers in Sales: {len(outliers)}')
print(outliers[['Date','Region','Category','Sales']])
Number of outliers in Sales: 0 Empty DataFrame Columns: [Date, Region, Category, Sales] Index: []
InΒ [15]:
# Sales by Year by Quarter
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Year'] = sales_df['Date'].dt.year
sales_df['Quarter'] = sales_df['Date'].dt.quarter
sales_by_yq = sales_df.groupby(['Year', 'Quarter'])['Sales'].sum().unstack(0)
print('Sales by Year by Quarter:')
print(sales_by_yq)
sales_by_yq.plot(kind='bar', figsize=(10,6))
plt.title('Sales by Year by Quarter')
plt.ylabel('Total Sales')
plt.xlabel('Quarter')
plt.legend(title='Year')
plt.show()
Sales by Year by Quarter: Year 2023 Quarter 1 128440.67 2 129533.83 3 158507.82 4 95315.13
InΒ [18]:
# Find Category and Region for Maximum Sales Amount
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
max_row = sales_df[sales_df['Sales'] == sales_df['Sales'].max()].iloc[0]
print('Maximum Sales Amount:', max_row['Sales'])
print('Category:', max_row['Category'])
print('Region:', max_row['Region'])
Maximum Sales Amount: 4991.91 Category: Office Supplies Region: South
InΒ [19]:
# Find Category and Region for Minimum Sales Amount
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
min_row = sales_df[sales_df['Sales'] == sales_df['Sales'].min()].iloc[0]
print('Minimum Sales Amount:', min_row['Sales'])
print('Category:', min_row['Category'])
print('Region:', min_row['Region'])
Minimum Sales Amount: 513.56 Category: Office Supplies Region: North
InΒ [20]:
# Bar Chart of Sales by Category
import pandas as pd
import matplotlib.pyplot as plt
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_by_category = sales_df.groupby('Category')['Sales'].sum()
plt.figure(figsize=(8,6))
sales_by_category.plot(kind='bar', color='skyblue')
plt.title('Total Sales by Category')
plt.xlabel('Category')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
InΒ [22]:
# Box Plot of Sales Distribution with Outliers Highlighted
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8,6))
sns.boxplot(y=sales_df['Sales'], color='lightblue')
plt.title('Sales Distribution with Outliers Highlighted')
plt.ylabel('Sales Amount')
plt.show()
InΒ [23]:
# Histogram of Sales Distribution with Outliers Highlighted
import matplotlib.pyplot as plt
import seaborn as sns
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
Q1 = sales_df['Sales'].quantile(0.25)
Q3 = sales_df['Sales'].quantile(0.75)
IQR = Q3 - Q1
outliers = sales_df[(sales_df['Sales'] < Q1 - 1.5 * IQR) | (sales_df['Sales'] > Q3 + 1.5 * IQR)]
plt.figure(figsize=(10,6))
sns.histplot(sales_df['Sales'], bins=30, kde=True, color='skyblue', label='Sales')
plt.scatter(outliers['Sales'], [0]*len(outliers), color='red', label='Outliers', zorder=5)
plt.title('Sales Distribution with Outliers Highlighted')
plt.xlabel('Sales Amount')
plt.ylabel('Frequency')
plt.legend()
plt.show()
InΒ [25]:
# Pivot Table: Sales by Year and Quarter
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Year'] = sales_df['Date'].dt.year
sales_df['Quarter'] = sales_df['Date'].dt.quarter
pivot = pd.pivot_table(sales_df, values='Sales', index='Year', columns='Quarter', aggfunc='sum', fill_value=0)
print('Pivot Table: Sales by Year and Quarter')
print(pivot)
Pivot Table: Sales by Year and Quarter Quarter 1 2 3 4 Year 2023 128440.67 129533.83 158507.82 95315.13
InΒ [26]:
# Heatmap of Sales by Year and Quarter
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8,6))
sns.heatmap(pivot, annot=True, fmt='.2f', cmap='Blues')
plt.title('Sales by Year and Quarter (Heatmap)')
plt.xlabel('Quarter')
plt.ylabel('Year')
plt.show()
InΒ [27]:
# Grouped Bar Chart of Sales by Year and Quarter
import matplotlib.pyplot as plt
pivot.plot(kind='bar', figsize=(10,6))
plt.title('Sales by Year and Quarter (Grouped Bar Chart)')
plt.xlabel('Year')
plt.ylabel('Total Sales')
plt.legend(title='Quarter')
plt.tight_layout()
plt.show()
InΒ [28]:
# Pivot Table: Count of Orders by Month
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
pivot_count = sales_df.groupby('Month').size().to_frame('Order Count')
print('Pivot Table: Count of Orders by Month')
print(pivot_count)
Pivot Table: Count of Orders by Month
Order Count
Month
2023-01 15
2023-02 15
2023-03 24
2023-04 26
2023-05 10
2023-06 14
2023-07 25
2023-08 15
2023-09 22
2023-10 14
2023-11 13
2023-12 12
Key Takeaways from Sales DataΒΆ
- Office Supplies is the leading category in total sales.
- South and East regions contribute the most to overall sales.
- The maximum sales amount is 4991.91 (Office Supplies, South), and the minimum is 513.56 (Office Supplies, North).
- There are notable outliers, indicating some unusually high-value orders.
- Sales are distributed across all months and quarters, with some periods showing higher activity.
- Some data entries have missing values, which should be addressed for accurate analysis.
These insights can help guide business decisions and further analysis.
InΒ [29]:
# Analyze Furniture Sales by Region and Month
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
furniture_sales = sales_df[sales_df['Category'] == 'Furniture']
region_summary = furniture_sales.groupby('Region')['Sales'].sum().sort_values(ascending=False)
month_summary = furniture_sales.groupby('Month')['Sales'].sum().sort_values(ascending=False)
print('Furniture Sales by Region:')
print(region_summary)
print('\nFurniture Sales by Month:')
print(month_summary)
Furniture Sales by Region: Region North 48474.60 East 48090.25 South 39653.99 West 29039.30 Name: Sales, dtype: float64 Furniture Sales by Month: Month 2023-09 22869.85 2023-07 20101.75 2023-02 17510.49 2023-10 15745.86 2023-04 14481.87 2023-08 14439.10 2023-12 14330.67 2023-03 12085.68 2023-01 11862.36 2023-06 9308.52 2023-11 9008.27 2023-05 8329.29 Freq: M, Name: Sales, dtype: float64
Category Performance and Tips to Increase SalesΒΆ
Top Performing Category by Sales Amount:
- Office Supplies (based on previous analysis)
- Furniture (also performs strongly)
Least Performing Category by Sales Amount:
- Technology
Tips to Increase Sales for Technology CategoryΒΆ
- Analyze customer needs and update product offerings.
- Run targeted promotions and discounts for Technology products.
- Bundle Technology items with top-selling categories (e.g., Office Supplies).
- Improve product visibility online and in-store.
- Educate customers about new technology features and benefits.
- Partner with businesses for bulk or corporate sales.
- Collect and act on customer feedback to improve satisfaction.
These actions can help boost sales for the least performing category.
InΒ [32]:
# Sales Trend by Category and Region
import matplotlib.pyplot as plt
categories = sales_df['Category'].unique()
regions = sales_df['Region'].unique()
plt.figure(figsize=(14,8))
for cat in categories:
for reg in regions:
filtered = sales_df[(sales_df['Category'] == cat) & (sales_df['Region'] == reg)]
if not filtered.empty:
trend = filtered.groupby('Date')['Sales'].sum()
plt.plot(trend.index, trend.values, label=f'{cat} - {reg}')
plt.title('Sales Trend by Category and Region')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Additional Analysis: Top Category-Region Combinations
combo_sales = sales_df.groupby(['Category', 'Region'])['Sales'].sum().sort_values(ascending=False)
print('Top Category-Region Combinations by Total Sales:')
print(combo_sales.head(5))
Top Category-Region Combinations by Total Sales:
Category Region
Technology West 51666.69
Office Supplies East 50896.49
Furniture North 48474.60
East 48090.25
Technology South 43637.60
Name: Sales, dtype: float64
InΒ [34]:
# Breakdown: Sales by Category and Region (Bar Chart)
import matplotlib.pyplot as plt
import seaborn as sns
cat_reg_sales = sales_df.groupby(['Category', 'Region'])['Sales'].sum().reset_index()
plt.figure(figsize=(10,6))
sns.barplot(x='Category', y='Sales', hue='Region', data=cat_reg_sales)
plt.title('Sales by Category and Region')
plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
# Pie Chart: Sales Distribution by Category
plt.figure(figsize=(7,7))
sales_by_category = sales_df.groupby('Category')['Sales'].sum()
sales_by_category.plot(kind='pie', autopct='%1.1f%%', startangle=90)
plt.title('Sales Distribution by Category')
plt.ylabel('')
plt.show()
InΒ [37]:
# Trend of Top 2 Regions by Total Sales
import matplotlib.pyplot as plt
region_totals = sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
top_regions = region_totals.index[:2]
plt.figure(figsize=(14,8))
for reg in top_regions:
reg_data = sales_df[sales_df['Region'] == reg]
trend = reg_data.groupby('Date')['Sales'].sum()
plt.plot(trend.index, trend.values, label=reg)
plt.title('Sales Trend Over Time for Top 2 Regions')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
InΒ [40]:
# Further Breakdown: Top 2 Regions by Category and Month
import matplotlib.pyplot as plt
import seaborn as sns
region_totals = sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
top_regions = region_totals.index[:2]
for reg in top_regions:
reg_data = sales_df[sales_df['Region'] == reg]
# Sales by Category
cat_sales = reg_data.groupby('Category')['Sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(8,5))
cat_sales.plot(kind='bar', color='orange')
plt.title(f'Sales by Category in {reg}')
plt.xlabel('Category')
plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
InΒ [41]:
# Analyze Furniture Sales by Region and Month
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
furniture_sales = sales_df[sales_df['Category'] == 'Furniture']
region_summary = furniture_sales.groupby('Region')['Sales'].sum().sort_values(ascending=False)
month_summary = furniture_sales.groupby('Month')['Sales'].sum().sort_values(ascending=False)
print('Furniture Sales by Region:')
print(region_summary)
print('\nFurniture Sales by Month:')
print(month_summary)
Furniture Sales by Region: Region North 48474.60 East 48090.25 South 39653.99 West 29039.30 Name: Sales, dtype: float64 Furniture Sales by Month: Month 2023-09 22869.85 2023-07 20101.75 2023-02 17510.49 2023-10 15745.86 2023-04 14481.87 2023-08 14439.10 2023-12 14330.67 2023-03 12085.68 2023-01 11862.36 2023-06 9308.52 2023-11 9008.27 2023-05 8329.29 Freq: M, Name: Sales, dtype: float64
Category Performance and Tips to Increase SalesΒΆ
Top Performing Category by Sales Amount:
- Office Supplies (based on previous analysis)
- Furniture (also performs strongly)
Least Performing Category by Sales Amount:
- Technology
Tips to Increase Sales for Technology CategoryΒΆ
- Analyze customer needs and update product offerings.
- Run targeted promotions and discounts for Technology products.
- Bundle Technology items with top-selling categories (e.g., Office Supplies).
- Improve product visibility online and in-store.
- Educate customers about new technology features and benefits.
- Partner with businesses for bulk or corporate sales.
- Collect and act on customer feedback to improve satisfaction.
These actions can help boost sales for the least performing category.
InΒ [42]:
# Sales Trend by Category and Region
import matplotlib.pyplot as plt
categories = sales_df['Category'].unique()
regions = sales_df['Region'].unique()
plt.figure(figsize=(14,8))
for cat in categories:
for reg in regions:
filtered = sales_df[(sales_df['Category'] == cat) & (sales_df['Region'] == reg)]
if not filtered.empty:
trend = filtered.groupby('Date')['Sales'].sum()
plt.plot(trend.index, trend.values, label=f'{cat} - {reg}')
plt.title('Sales Trend by Category and Region')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Additional Analysis: Top Category-Region Combinations
combo_sales = sales_df.groupby(['Category', 'Region'])['Sales'].sum().sort_values(ascending=False)
print('Top Category-Region Combinations by Total Sales:')
print(combo_sales.head(5))
Top Category-Region Combinations by Total Sales:
Category Region
Technology West 51666.69
Office Supplies East 50896.49
Furniture North 48474.60
East 48090.25
Technology South 43637.60
Name: Sales, dtype: float64
InΒ [43]:
# Breakdown: Sales by Category and Region (Bar Chart)
import matplotlib.pyplot as plt
import seaborn as sns
cat_reg_sales = sales_df.groupby(['Category', 'Region'])['Sales'].sum().reset_index()
plt.figure(figsize=(10,6))
sns.barplot(x='Category', y='Sales', hue='Region', data=cat_reg_sales)
plt.title('Sales by Category and Region')
plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
# Pie Chart: Sales Distribution by Category
plt.figure(figsize=(7,7))
sales_by_category = sales_df.groupby('Category')['Sales'].sum()
sales_by_category.plot(kind='pie', autopct='%1.1f%%', startangle=90)
plt.title('Sales Distribution by Category')
plt.ylabel('')
plt.show()
InΒ [44]:
# Trend of Top 2 Regions by Total Sales
import matplotlib.pyplot as plt
region_totals = sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
top_regions = region_totals.index[:2]
plt.figure(figsize=(14,8))
for reg in top_regions:
reg_data = sales_df[sales_df['Region'] == reg]
trend = reg_data.groupby('Date')['Sales'].sum()
plt.plot(trend.index, trend.values, label=reg)
plt.title('Sales Trend Over Time for Top 2 Regions')
plt.xlabel('Date')
plt.ylabel('Total Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
InΒ [45]:
# Further Breakdown: Top 2 Regions by Category and Month
import matplotlib.pyplot as plt
import seaborn as sns
region_totals = sales_df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
top_regions = region_totals.index[:2]
for reg in top_regions:
reg_data = sales_df[sales_df['Region'] == reg]
# Sales by Category
cat_sales = reg_data.groupby('Category')['Sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(8,5))
cat_sales.plot(kind='bar', color='orange')
plt.title(f'Sales by Category in {reg}')
plt.xlabel('Category')
plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
InΒ [47]:
# Analyze Furniture Sales by Region and Month
import pandas as pd
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
furniture_sales = sales_df[sales_df['Category'] == 'Furniture']
region_summary = furniture_sales.groupby('Region')['Sales'].sum().sort_values(ascending=False)
month_summary = furniture_sales.groupby('Month')['Sales'].sum().sort_values(ascending=False)
print('Furniture Sales by Region:')
print(region_summary)
print('\nFurniture Sales by Month:')
print(month_summary)
Furniture Sales by Region: Region North 48474.60 East 48090.25 South 39653.99 West 29039.30 Name: Sales, dtype: float64 Furniture Sales by Month: Month 2023-09 22869.85 2023-07 20101.75 2023-02 17510.49 2023-10 15745.86 2023-04 14481.87 2023-08 14439.10 2023-12 14330.67 2023-03 12085.68 2023-01 11862.36 2023-06 9308.52 2023-11 9008.27 2023-05 8329.29 Freq: M, Name: Sales, dtype: float64
Category Performance and Tips to Increase SalesΒΆ
Top Performing Category by Sales Amount:
- Office Supplies (based on previous analysis)
- Furniture (also performs strongly)
Least Performing Category by Sales Amount:
- Technology
Tips to Increase Sales for Technology CategoryΒΆ
- Analyze customer needs and update product offerings.
- Run targeted promotions and discounts for Technology products.
- Bundle Technology items with top-selling categories (e.g., Office Supplies).
- Improve product visibility online and in-store.
- Educate customers about new technology features and benefits.
- Partner with businesses for bulk or corporate sales.
- Collect and act on customer feedback to improve satisfaction.
These actions can help boost sales for the least performing category.
InΒ [53]:
# Sales Dashboard (Matplotlib)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sales_df = pd.read_csv('sales_data_with_issues.csv')
sales_df = sales_df[pd.to_numeric(sales_df['Sales'], errors='coerce').notnull()]
sales_df['Sales'] = sales_df['Sales'].astype(float)
sales_df['Date'] = pd.to_datetime(sales_df['Date'], errors='coerce')
sales_df['Month'] = sales_df['Date'].dt.to_period('M')
fig, axs = plt.subplots(2, 2, figsize=(16, 10))
# KPI: Total, Average, Count
total_sales = sales_df['Sales'].sum()
avg_sales = sales_df['Sales'].mean()
order_count = len(sales_df)
axs[0,0].text(0.1, 0.8, f'Total Sales: {total_sales:.2f}', fontsize=14)
axs[0,0].text(0.1, 0.6, f'Average Sales: {avg_sales:.2f}', fontsize=14)
axs[0,0].text(0.1, 0.4, f'Order Count: {order_count}', fontsize=14)
axs[0,0].axis('off')
axs[0,0].set_title('KPIs')
# Sales Trend
daily_sales = sales_df.groupby('Date')['Sales'].sum()
axs[0,1].plot(daily_sales.index, daily_sales.values, color='navy')
axs[0,1].set_title('Sales Trend Over Time')
axs[0,1].set_xlabel('Date')
axs[0,1].set_ylabel('Total Sales')
axs[0,1].grid(True)
# Sales by Category
sales_by_category = sales_df.groupby('Category')['Sales'].sum()
axs[1,0].bar(sales_by_category.index, sales_by_category.values, color='skyblue')
axs[1,0].set_title('Sales by Category')
axs[1,0].set_xlabel('Category')
axs[1,0].set_ylabel('Total Sales')
# Sales by Region
sales_by_region = sales_df.groupby('Region')['Sales'].sum()
axs[1,1].bar(sales_by_region.index, sales_by_region.values, color='orange')
axs[1,1].set_title('Sales by Region')
axs[1,1].set_xlabel('Region')
axs[1,1].set_ylabel('Total Sales')
plt.tight_layout()
plt.show()
# Outlier Detection (Box Plot)
plt.figure(figsize=(8,4))
sns.boxplot(y=sales_df['Sales'], color='lightgreen')
plt.title('Sales Amount Distribution & Outliers')
plt.ylabel('Sales Amount')
plt.show()
InΒ [55]:
# Scatter plot: Sales vs. Date
import matplotlib.pyplot as plt
import pandas as pd
# If not already loaded, load the data
df = sales_df if 'sales_df' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date'])
# Drop rows with missing sales values
df_clean = df.dropna(subset=['Sales'])
plt.figure(figsize=(10,6))
plt.scatter(df_clean['Date'], df_clean['Sales'], alpha=0.6, c='blue')
plt.title('Scatter Plot of Sales vs. Date')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True)
plt.tight_layout()
plt.show()
InΒ [56]:
# Advanced Customer-Focused Analysis & Actionable Insights
import matplotlib.pyplot as plt
import pandas as pd
# Use existing cleaned data
df = df_clean if 'df_clean' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date']).dropna(subset=['Sales'])
# Add month and quarter columns
df['Month'] = df['Date'].dt.month
fiscal_quarter = ((df['Date'].dt.month-1)//3)+1
df['Quarter'] = fiscal_quarter
# 1. Seasonality: Monthly and Quarterly Sales Trends
monthly_sales = df.groupby('Month')['Sales'].sum()
quarterly_sales = df.groupby('Quarter')['Sales'].sum()
fig, axs = plt.subplots(1, 2, figsize=(14,5))
axs[0].bar(monthly_sales.index, monthly_sales.values, color='skyblue')
axs[0].set_title('Total Sales by Month')
axs[0].set_xlabel('Month')
axs[0].set_ylabel('Sales')
axs[1].bar(quarterly_sales.index, quarterly_sales.values, color='orange')
axs[1].set_title('Total Sales by Quarter')
axs[1].set_xlabel('Quarter')
axs[1].set_ylabel('Sales')
plt.tight_layout()
plt.show()
# 2. Region-Category Interaction
pivot_rc = df.pivot_table(index='Region', columns='Category', values='Sales', aggfunc='sum')
pivot_rc.plot(kind='bar', figsize=(10,6))
plt.title('Sales by Region and Category')
plt.ylabel('Sales')
plt.tight_layout()
plt.show()
# 3. Top/Bottom Performers
best_month = monthly_sales.idxmax()
best_month_sales = monthly_sales.max()
worst_month = monthly_sales.idxmin()
worst_month_sales = monthly_sales.min()
best_region = df.groupby('Region')['Sales'].sum().idxmax()
best_region_sales = df.groupby('Region')['Sales'].sum().max()
worst_region = df.groupby('Region')['Sales'].sum().idxmin()
worst_region_sales = df.groupby('Region')['Sales'].sum().min()
print(f"Best Month: {best_month} with sales {best_month_sales:.2f}")
print(f"Worst Month: {worst_month} with sales {worst_month_sales:.2f}")
print(f"Best Region: {best_region} with sales {best_region_sales:.2f}")
print(f"Worst Region: {worst_region} with sales {worst_region_sales:.2f}")
# 4. Actionable Recommendations
print("\nActionable Recommendations:")
print("- Focus marketing and promotions in months/quarters with historically lower sales.")
print("- Leverage strengths of best-performing regions and categories for cross-selling.")
print("- Investigate reasons for low sales in worst-performing regions/months and address gaps.")
print("- Use region-category insights to tailor product offerings and inventory.")
Best Month: 4 with sales 65641.71 Worst Month: 11 with sales 24268.67 Best Region: East with sales 133186.70 Worst Region: South with sales 117033.54 Actionable Recommendations: - Focus marketing and promotions in months/quarters with historically lower sales. - Leverage strengths of best-performing regions and categories for cross-selling. - Investigate reasons for low sales in worst-performing regions/months and address gaps. - Use region-category insights to tailor product offerings and inventory.
InΒ [60]:
# Sales Forecasting using Linear Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Use cleaned data
df = df_clean if 'df_clean' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date']).dropna(subset=['Sales'])
# Aggregate sales by month
df['YearMonth'] = df['Date'].dt.to_period('M')
monthly_sales = df.groupby('YearMonth')['Sales'].sum().reset_index()
monthly_sales['YearMonth'] = monthly_sales['YearMonth'].dt.to_timestamp()
# Prepare data for regression
monthly_sales['MonthNum'] = np.arange(len(monthly_sales))
X = monthly_sales[['MonthNum']]
y = monthly_sales['Sales']
# Fit linear regression model
model = LinearRegression()
model.fit(X, y)
# Forecast next 6 months
future_months = np.arange(len(monthly_sales), len(monthly_sales)+6)
future_dates = pd.date_range(monthly_sales['YearMonth'].iloc[-1]+pd.offsets.MonthBegin(1), periods=6, freq='MS')
future_sales = model.predict(future_months.reshape(-1,1))
# Plot actual and forecasted sales
plt.figure(figsize=(10,6))
plt.plot(monthly_sales['YearMonth'], monthly_sales['Sales'], label='Actual Sales', marker='o')
plt.plot(future_dates, future_sales, label='Forecasted Sales', marker='x', linestyle='--', color='red')
plt.title('Sales Forecast for Next 6 Months (Linear Regression)')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Print forecasted sales values
forecast_df = pd.DataFrame({'Month': future_dates.strftime('%Y-%m'), 'Forecasted Sales': future_sales})
print(forecast_df)
C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Month Forecasted Sales 0 2024-01 36680.995000 1 2024-02 35762.719231 2 2024-03 34844.443462 3 2024-04 33926.167692 4 2024-05 33007.891923 5 2024-06 32089.616154
InΒ [63]:
# Advanced Sales Forecasting using ARIMA
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
# Use cleaned data
df = df_clean if 'df_clean' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date']).dropna(subset=['Sales'])
df['YearMonth'] = df['Date'].dt.to_period('M')
monthly_sales = df.groupby('YearMonth')['Sales'].sum()
monthly_sales.index = monthly_sales.index.to_timestamp()
# Fit ARIMA model (order can be tuned)
model = ARIMA(monthly_sales, order=(1,1,1))
model_fit = model.fit()
# Forecast next 6 months
forecast = model_fit.forecast(steps=6)
forecast_index = pd.date_range(monthly_sales.index[-1]+pd.offsets.MonthBegin(1), periods=6, freq='MS')
# Plot actual and forecasted sales
plt.figure(figsize=(10,6))
plt.plot(monthly_sales.index, monthly_sales.values, label='Actual Sales', marker='o')
plt.plot(forecast_index, forecast.values, label='ARIMA Forecast', marker='x', linestyle='--', color='green')
plt.title('Sales Forecast for Next 6 Months (ARIMA)')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Print forecasted sales values
forecast_df_arima = pd.DataFrame({'Month': forecast_index.strftime('%Y-%m'), 'ARIMA Forecasted Sales': forecast.values})
print(forecast_df_arima)
Month ARIMA Forecasted Sales 0 2024-01 34604.804235 1 2024-02 35948.086245 2 2024-03 36523.655584 3 2024-04 36770.275476 4 2024-05 36875.947142 5 2024-06 36921.225327
InΒ [62]:
# Sales Forecast by Region and Category (Linear Regression)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Use cleaned data
df = df_clean if 'df_clean' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date']).dropna(subset=['Sales'])
df['YearMonth'] = df['Date'].dt.to_period('M')
regions = df['Region'].dropna().unique()
categories = df['Category'].dropna().unique()
region_forecasts = {}
category_forecasts = {}
future_months = np.arange(df['YearMonth'].nunique(), df['YearMonth'].nunique()+6)
future_dates = pd.date_range(df['YearMonth'].dt.to_timestamp().max()+pd.offsets.MonthBegin(1), periods=6, freq='MS')
# Forecast by Region
for region in regions:
reg_df = df[df['Region'] == region].groupby('YearMonth')['Sales'].sum().reset_index()
reg_df['YearMonth'] = reg_df['YearMonth'].dt.to_timestamp()
reg_df['MonthNum'] = np.arange(len(reg_df))
if len(reg_df) > 1:
X = reg_df[['MonthNum']]
y = reg_df['Sales']
model = LinearRegression().fit(X, y)
forecast = model.predict(future_months.reshape(-1,1))
region_forecasts[region] = forecast
plt.plot(reg_df['YearMonth'], reg_df['Sales'], label=f'{region} Actual')
plt.plot(future_dates, forecast, '--', label=f'{region} Forecast')
plt.title('Sales Forecast by Region (Next 6 Months)')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.legend()
plt.tight_layout()
plt.show()
# Forecast by Category
for category in categories:
cat_df = df[df['Category'] == category].groupby('YearMonth')['Sales'].sum().reset_index()
cat_df['YearMonth'] = cat_df['YearMonth'].dt.to_timestamp()
cat_df['MonthNum'] = np.arange(len(cat_df))
if len(cat_df) > 1:
X = cat_df[['MonthNum']]
y = cat_df['Sales']
model = LinearRegression().fit(X, y)
forecast = model.predict(future_months.reshape(-1,1))
category_forecasts[category] = forecast
plt.plot(cat_df['YearMonth'], cat_df['Sales'], label=f'{category} Actual')
plt.plot(future_dates, forecast, '--', label=f'{category} Forecast')
plt.title('Sales Forecast by Category (Next 6 Months)')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.legend()
plt.tight_layout()
plt.show()
# Print forecasted sales values for each region and category
print('Forecasted Sales by Region (Next 6 Months):')
for region, forecast in region_forecasts.items():
print(f'{region}:', np.round(forecast,2))
print('\nForecasted Sales by Category (Next 6 Months):')
for category, forecast in category_forecasts.items():
print(f'{category}:', np.round(forecast,2))
C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Forecasted Sales by Region (Next 6 Months): North: [8575.49 8246.67 7917.84 7589.02 7260.2 6931.37] East: [13519.26 13891.62 14263.99 14636.35 15008.72 15381.08] South: [8224.64 7989.54 7754.44 7519.34 7284.24 7049.13] West: [7551.53 7155.22 6758.91 6362.6 5966.29 5569.99] Forecasted Sales by Category (Next 6 Months): Furniture: [15248.16 15413.59 15579.03 15744.47 15909.91 16075.35] Technology: [10898.4 10328.37 9758.35 9188.32 8618.3 8048.27] Office Supplies: [10534.44 10020.75 9507.06 8993.38 8479.69 7966. ]
InΒ [65]:
# Pivot table: Sales over time (Month vs Region)
import pandas as pd
# Use cleaned data
df = df_clean if 'df_clean' in globals() else pd.read_csv('sales_data_with_issues.csv', parse_dates=['Date']).dropna(subset=['Sales'])
df['Month'] = df['Date'].dt.to_period('M')
pivot_month_region = pd.pivot_table(df, values='Sales', index='Month', columns='Region', aggfunc='sum', fill_value=0)
print('Pivot Table: Sales by Month and Region')
display(pivot_month_region)
Pivot Table: Sales by Month and Region
| Region | East | North | South | West |
|---|---|---|---|---|
| Month | ||||
| 2023-01 | 8322.13 | 5153.19 | 12700.12 | 10090.63 |
| 2023-02 | 4404.48 | 9881.63 | 14249.27 | 1969.00 |
| 2023-03 | 13228.56 | 17115.00 | 6113.37 | 13720.48 |
| 2023-04 | 10040.69 | 21418.69 | 10189.05 | 23993.28 |
| 2023-05 | 3828.38 | 9064.22 | 5264.74 | 12664.53 |
| 2023-06 | 13603.87 | 1679.25 | 7975.77 | 9811.36 |
| 2023-07 | 21451.04 | 17656.22 | 11545.49 | 10388.33 |
| 2023-08 | 11991.06 | 14085.33 | 8069.27 | 5691.82 |
| 2023-09 | 17727.22 | 6112.06 | 21414.78 | 12375.20 |
| 2023-10 | 12840.88 | 11471.79 | 9526.71 | 5737.27 |
| 2023-11 | 1803.23 | 13756.10 | 567.06 | 8142.28 |
| 2023-12 | 13945.16 | 1160.61 | 9417.91 | 6946.13 |
InΒ [66]:
# Region-wise Product Performance Analysis
import pandas as pd
# Load the sales data
df_sales = pd.read_csv('sales.csv')
# Group by Region and Product, summing Total Revenue and Units Sold
region_product_perf = df_sales.groupby(['Region', 'Product']).agg({'Total Revenue': 'sum', 'Units Sold': 'sum'}).reset_index()
# Sort within each region by Total Revenue descending
region_product_perf = region_product_perf.sort_values(['Region', 'Total Revenue'], ascending=[True, False])
print('Region-wise Product Performance:')
display(region_product_perf)
Region-wise Product Performance:
| Region | Product | Total Revenue | Units Sold | |
|---|---|---|---|---|
| 4 | East | Tablet | 477412 | 833 |
| 1 | East | Laptop | 351203 | 632 |
| 3 | East | Phone | 299314 | 391 |
| 0 | East | Keyboard | 270648 | 632 |
| 2 | East | Monitor | 158667 | 209 |
| 6 | North | Laptop | 653017 | 968 |
| 9 | North | Tablet | 238277 | 366 |
| 5 | North | Keyboard | 170615 | 373 |
| 8 | North | Phone | 126741 | 345 |
| 7 | North | Monitor | 96120 | 192 |
| 12 | South | Monitor | 431724 | 900 |
| 13 | South | Phone | 374465 | 539 |
| 14 | South | Tablet | 368582 | 574 |
| 11 | South | Laptop | 292531 | 566 |
| 10 | South | Keyboard | 89094 | 151 |
| 19 | West | Tablet | 576844 | 673 |
| 15 | West | Keyboard | 540270 | 936 |
| 16 | West | Laptop | 390025 | 590 |
| 17 | West | Monitor | 216463 | 765 |
| 18 | West | Phone | 38220 | 91 |
InΒ [67]:
# Visualization: Region-wise Product Performance using Seaborn
import seaborn as sns
import matplotlib.pyplot as plt
# If not already loaded, load the data and summary
df_sales = pd.read_csv('sales.csv')
region_product_perf = df_sales.groupby(['Region', 'Product']).agg({'Total Revenue': 'sum', 'Units Sold': 'sum'}).reset_index()
plt.figure(figsize=(14, 6))
sns.barplot(data=region_product_perf, x='Region', y='Total Revenue', hue='Product')
plt.title('Total Revenue by Product and Region')
plt.ylabel('Total Revenue')
plt.xlabel('Region')
plt.legend(title='Product', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 6))
sns.barplot(data=region_product_perf, x='Region', y='Units Sold', hue='Product')
plt.title('Units Sold by Product and Region')
plt.ylabel('Units Sold')
plt.xlabel('Region')
plt.legend(title='Product', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
InΒ [71]:
# Monthly Revenue Analysis
import pandas as pd
import matplotlib.pyplot as plt
df_sales = pd.read_csv('sales.csv', parse_dates=['Date'])
df_sales['Month'] = df_sales['Date'].dt.to_period('M')
monthly_revenue = df_sales.groupby('Month')['Total Revenue'].sum().reset_index()
print('Monthly Total Revenue:')
display(monthly_revenue)
plt.figure(figsize=(10,5))
plt.plot(monthly_revenue['Month'].astype(str), monthly_revenue['Total Revenue'], marker='o', color='teal')
plt.title('Monthly Total Revenue Trend')
plt.xlabel('Month')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Monthly Total Revenue:
| Month | Total Revenue | |
|---|---|---|
| 0 | 2023-01 | 1860657 |
| 1 | 2023-02 | 1772464 |
| 2 | 2023-03 | 1889127 |
| 3 | 2023-04 | 637984 |
InΒ [72]:
# Revenue Forecasting using Linear Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
df_sales = pd.read_csv('sales.csv', parse_dates=['Date'])
df_sales['Month'] = df_sales['Date'].dt.to_period('M')
monthly_revenue = df_sales.groupby('Month')['Total Revenue'].sum().reset_index()
monthly_revenue['MonthNum'] = np.arange(len(monthly_revenue))
# Prepare data for regression
X = monthly_revenue[['MonthNum']]
y = monthly_revenue['Total Revenue']
model = LinearRegression().fit(X, y)
# Forecast next 6 months
future_months = np.arange(len(monthly_revenue), len(monthly_revenue)+6)
future_dates = pd.date_range(monthly_revenue['Month'].dt.to_timestamp().max()+pd.offsets.MonthBegin(1), periods=6, freq='MS')
future_revenue = model.predict(future_months.reshape(-1,1))
# Plot actual and forecasted revenue
plt.figure(figsize=(10,5))
plt.plot(monthly_revenue['Month'].astype(str), monthly_revenue['Total Revenue'], marker='o', label='Actual Revenue')
plt.plot(future_dates.strftime('%Y-%m'), future_revenue, marker='x', linestyle='--', color='red', label='Forecasted Revenue')
plt.title('Monthly Revenue Forecast (Linear Regression)')
plt.xlabel('Month')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
# Print forecasted revenue values
forecast_df = pd.DataFrame({'Month': future_dates.strftime('%Y-%m'), 'Forecasted Revenue': future_revenue})
print(forecast_df)
C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Month Forecasted Revenue 0 2023-05 652219.0 1 2023-06 297083.4 2 2023-07 -58052.2 3 2023-08 -413187.8 4 2023-09 -768323.4 5 2023-10 -1123459.0
InΒ [73]:
# Identify Unusually High or Low Total Revenue Records (Outlier Detection)
import pandas as pd
df_sales = pd.read_csv('sales.csv')
# Calculate IQR for Total Revenue
Q1 = df_sales['Total Revenue'].quantile(0.25)
Q3 = df_sales['Total Revenue'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df_sales[(df_sales['Total Revenue'] < lower_bound) | (df_sales['Total Revenue'] > upper_bound)]
print('Unusually High or Low Total Revenue Records:')
display(outliers)
Unusually High or Low Total Revenue Records:
| Date | Product | Region | Units Sold | Unit Price | Total Revenue |
|---|
InΒ [74]:
# Pivot Table: Total Profit by Product Category and Region
import pandas as pd
# Load the salesone.csv data
df_salesone = pd.read_csv('salesone.csv')
# Create a pivot table: total profit by Product (category) and Region
pivot_profit = pd.pivot_table(df_salesone, values='Profit', index='Product', columns='Region', aggfunc='sum', fill_value=0)
print('Pivot Table: Total Profit by Product Category and Region')
display(pivot_profit)
Pivot Table: Total Profit by Product Category and Region
| Region | East | North | South | West |
|---|---|---|---|---|
| Product | ||||
| Keyboard | 15469 | 9774 | 5136 | 27196 |
| Laptop | 23139 | 25947 | 15656 | 20630 |
| Monitor | 7031 | 24464 | 27614 | 48774 |
| Phone | 13662 | 7775 | 26041 | 1234 |
| Tablet | 28237 | 15329 | 11324 | 19080 |
InΒ [75]:
# Compare Product-wise Revenue and Profit per Region
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the salesone.csv data
df_salesone = pd.read_csv('salesone.csv')
# Aggregate revenue and profit by product and region
agg = df_salesone.groupby(['Region', 'Product']).agg({'Total Revenue': 'sum', 'Profit': 'sum'}).reset_index()
# Melt for easier plotting
agg_melted = agg.melt(id_vars=['Region', 'Product'], value_vars=['Total Revenue', 'Profit'], var_name='Metric', value_name='Value')
# Plot grouped barplot for each region
regions = agg['Region'].unique()
for region in regions:
plt.figure(figsize=(10,5))
data = agg_melted[agg_melted['Region'] == region]
sns.barplot(data=data, x='Product', y='Value', hue='Metric')
plt.title(f'Product-wise Revenue and Profit in {region} Region')
plt.ylabel('Amount')
plt.xlabel('Product')
plt.legend(title='Metric')
plt.tight_layout()
plt.show()
InΒ [76]:
# Correlation between Profit and Other Available Columns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
# Select relevant numerical columns
num_cols = ['Units Sold', 'Unit Price', 'Total Revenue', 'Profit']
corr_matrix = df_salesone[num_cols].corr()
print('Correlation Matrix (including Profit):')
display(corr_matrix)
# Visualize the correlation matrix
plt.figure(figsize=(6,4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix: Profit vs Other Variables')
plt.tight_layout()
plt.show()
Correlation Matrix (including Profit):
| Units Sold | Unit Price | Total Revenue | Profit | |
|---|---|---|---|---|
| Units Sold | 1.000000 | -0.123234 | 0.639159 | 0.151937 |
| Unit Price | -0.123234 | 1.000000 | 0.609019 | -0.093946 |
| Total Revenue | 0.639159 | 0.609019 | 1.000000 | 0.009834 |
| Profit | 0.151937 | -0.093946 | 0.009834 | 1.000000 |
InΒ [80]:
# Simulate Different Unit Price Scenarios and Revenue Impact
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df_salesone = pd.read_csv('salesone.csv')
# Choose a product to simulate (or simulate for all products)
products = df_salesone['Product'].unique()
# Define price change scenarios (e.g., -20%, -10%, 0%, +10%, +20%)
price_changes = np.array([-0.2, -0.1, 0, 0.1, 0.2])
# Store results
scenario_results = []
for product in products:
prod_df = df_salesone[df_salesone['Product'] == product]
avg_units = prod_df['Units Sold'].mean()
base_price = prod_df['Unit Price'].mean()
for pct in price_changes:
new_price = base_price * (1 + pct)
# Assume units sold remains constant for simplicity
new_revenue = avg_units * new_price * len(prod_df)
scenario_results.append({'Product': product, 'Price Change %': pct*100, 'Simulated Revenue': new_revenue})
scenario_df = pd.DataFrame(scenario_results)
# Plot
plt.figure(figsize=(10,6))
for product in products:
data = scenario_df[scenario_df['Product'] == product]
plt.plot(data['Price Change %'], data['Simulated Revenue'], marker='o', label=product)
plt.title('Simulated Revenue Under Different Unit Price Scenarios')
plt.xlabel('Unit Price Change (%)')
plt.ylabel('Simulated Total Revenue')
plt.legend(title='Product')
plt.grid(True)
plt.tight_layout()
plt.show()
print('Simulated Revenue Table:')
display(scenario_df)
Simulated Revenue Table:
| Product | Price Change % | Simulated Revenue | |
|---|---|---|---|
| 0 | Phone | -20.0 | 7.107753e+05 |
| 1 | Phone | -10.0 | 7.996222e+05 |
| 2 | Phone | 0.0 | 8.884692e+05 |
| 3 | Phone | 10.0 | 9.773161e+05 |
| 4 | Phone | 20.0 | 1.066163e+06 |
| 5 | Laptop | -20.0 | 1.381920e+06 |
| 6 | Laptop | -10.0 | 1.554660e+06 |
| 7 | Laptop | 0.0 | 1.727400e+06 |
| 8 | Laptop | 10.0 | 1.900140e+06 |
| 9 | Laptop | 20.0 | 2.072879e+06 |
| 10 | Monitor | -20.0 | 8.233100e+05 |
| 11 | Monitor | -10.0 | 9.262237e+05 |
| 12 | Monitor | 0.0 | 1.029137e+06 |
| 13 | Monitor | 10.0 | 1.132051e+06 |
| 14 | Monitor | 20.0 | 1.234965e+06 |
| 15 | Keyboard | -20.0 | 9.021688e+05 |
| 16 | Keyboard | -10.0 | 1.014940e+06 |
| 17 | Keyboard | 0.0 | 1.127711e+06 |
| 18 | Keyboard | 10.0 | 1.240482e+06 |
| 19 | Keyboard | 20.0 | 1.353253e+06 |
| 20 | Tablet | -20.0 | 1.286736e+06 |
| 21 | Tablet | -10.0 | 1.447578e+06 |
| 22 | Tablet | 0.0 | 1.608420e+06 |
| 23 | Tablet | 10.0 | 1.769262e+06 |
| 24 | Tablet | 20.0 | 1.930104e+06 |
InΒ [82]:
# Simulate Unit Price Scenarios with Demand Elasticity
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df_salesone = pd.read_csv('salesone.csv')
products = df_salesone['Product'].unique()
# Define price change scenarios (e.g., -20%, -10%, 0%, +10%, +20%)
price_changes = np.array([-0.2, -0.1, 0, 0.1, 0.2])
# Assume a price elasticity of demand (negative means demand drops as price rises)
# Example: elasticity = -1.2 (for every 1% increase in price, units sold drops by 1.2%)
elasticity = -1.2
scenario_results = []
for product in products:
prod_df = df_salesone[df_salesone['Product'] == product]
avg_units = prod_df['Units Sold'].mean()
base_price = prod_df['Unit Price'].mean()
n_periods = len(prod_df)
for pct in price_changes:
new_price = base_price * (1 + pct)
# Adjust units sold based on elasticity
new_units = avg_units * (1 + elasticity * pct)
new_units = max(new_units, 0) # Units sold can't be negative
new_revenue = new_units * new_price * n_periods
scenario_results.append({'Product': product, 'Price Change %': pct*100, 'Simulated Revenue': new_revenue, 'Simulated Units Sold': new_units * n_periods})
scenario_df = pd.DataFrame(scenario_results)
# Plot revenue
plt.figure(figsize=(10,6))
for product in products:
data = scenario_df[scenario_df['Product'] == product]
plt.plot(data['Price Change %'], data['Simulated Revenue'], marker='o', label=product)
plt.title('Simulated Revenue Under Price Scenarios with Demand Elasticity')
plt.xlabel('Unit Price Change (%)')
plt.ylabel('Simulated Total Revenue')
plt.legend(title='Product')
plt.grid(True)
plt.tight_layout()
plt.show()
# Plot units sold
plt.figure(figsize=(10,6))
for product in products:
data = scenario_df[scenario_df['Product'] == product]
plt.plot(data['Price Change %'], data['Simulated Units Sold'], marker='s', label=product)
plt.title('Simulated Units Sold Under Price Scenarios with Demand Elasticity')
plt.xlabel('Unit Price Change (%)')
plt.ylabel('Simulated Total Units Sold')
plt.legend(title='Product')
plt.grid(True)
plt.tight_layout()
plt.show()
print('Simulated Revenue and Units Sold Table:')
display(scenario_df)
Simulated Revenue and Units Sold Table:
| Product | Price Change % | Simulated Revenue | Simulated Units Sold | |
|---|---|---|---|---|
| 0 | Phone | -20.0 | 8.813614e+05 | 1693.84 |
| 1 | Phone | -10.0 | 8.955769e+05 | 1529.92 |
| 2 | Phone | 0.0 | 8.884692e+05 | 1366.00 |
| 3 | Phone | 10.0 | 8.600382e+05 | 1202.08 |
| 4 | Phone | 20.0 | 8.102839e+05 | 1038.16 |
| 5 | Laptop | -20.0 | 1.713580e+06 | 3417.44 |
| 6 | Laptop | -10.0 | 1.741219e+06 | 3086.72 |
| 7 | Laptop | 0.0 | 1.727400e+06 | 2756.00 |
| 8 | Laptop | 10.0 | 1.672123e+06 | 2425.28 |
| 9 | Laptop | 20.0 | 1.575388e+06 | 2094.56 |
| 10 | Monitor | -20.0 | 1.020904e+06 | 2561.84 |
| 11 | Monitor | -10.0 | 1.037371e+06 | 2313.92 |
| 12 | Monitor | 0.0 | 1.029137e+06 | 2066.00 |
| 13 | Monitor | 10.0 | 9.962051e+05 | 1818.08 |
| 14 | Monitor | 20.0 | 9.385734e+05 | 1570.16 |
| 15 | Keyboard | -20.0 | 1.118689e+06 | 2594.08 |
| 16 | Keyboard | -10.0 | 1.136733e+06 | 2343.04 |
| 17 | Keyboard | 0.0 | 1.127711e+06 | 2092.00 |
| 18 | Keyboard | 10.0 | 1.091624e+06 | 1840.96 |
| 19 | Keyboard | 20.0 | 1.028472e+06 | 1589.92 |
| 20 | Tablet | -20.0 | 1.595552e+06 | 3033.04 |
| 21 | Tablet | -10.0 | 1.621287e+06 | 2739.52 |
| 22 | Tablet | 0.0 | 1.608420e+06 | 2446.00 |
| 23 | Tablet | 10.0 | 1.556950e+06 | 2152.48 |
| 24 | Tablet | 20.0 | 1.466879e+06 | 1858.96 |
InΒ [83]:
# Analyze Product Sales Peaks by Month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
df_salesone['Month'] = df_salesone['Date'].dt.strftime('%b')
# Group by Product and Month, sum Units Sold
monthly_product_sales = df_salesone.groupby(['Product', 'Month'])['Units Sold'].sum().reset_index()
# Order months chronologically
from pandas.api.types import CategoricalDtype
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_product_sales['Month'] = monthly_product_sales['Month'].astype(CategoricalDtype(categories=month_order, ordered=True))
plt.figure(figsize=(12,6))
sns.lineplot(data=monthly_product_sales, x='Month', y='Units Sold', hue='Product', marker='o')
plt.title('Monthly Units Sold by Product')
plt.ylabel('Units Sold')
plt.xlabel('Month')
plt.legend(title='Product')
plt.tight_layout()
plt.show()
# Show table of peak month for each product
peak_months = monthly_product_sales.loc[monthly_product_sales.groupby('Product')['Units Sold'].idxmax()][['Product', 'Month', 'Units Sold']]
print('Peak Month for Each Product:')
display(peak_months)
Peak Month for Each Product:
| Product | Month | Units Sold | |
|---|---|---|---|
| 3 | Keyboard | Mar | 720 |
| 5 | Laptop | Feb | 1035 |
| 9 | Monitor | Jan | 969 |
| 13 | Phone | Mar | 556 |
| 16 | Tablet | Jan | 1011 |
InΒ [84]:
# Estimate Reorder Point for High Demand Products
import pandas as pd
import numpy as np
from datetime import timedelta
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
# Define high demand as top 3 products by total units sold
top_products = df_salesone.groupby('Product')['Units Sold'].sum().sort_values(ascending=False).head(3).index.tolist()
# Assume a starting inventory for each product (can be customized)
starting_inventory = 500 # units
# Calculate recent average daily sales rate (last 30 days in data)
last_date = df_salesone['Date'].max()
window_start = last_date - pd.Timedelta(days=30)
recent_sales = df_salesone[df_salesone['Date'] >= window_start]
reorder_estimates = []
for product in top_products:
prod_sales = recent_sales[recent_sales['Product'] == product]
if not prod_sales.empty:
daily_rate = prod_sales['Units Sold'].sum() / 30
if daily_rate > 0:
days_left = starting_inventory / daily_rate
est_runout_date = last_date + timedelta(days=days_left)
else:
days_left = np.inf
est_runout_date = 'N/A'
else:
days_left = np.inf
est_runout_date = 'N/A'
reorder_estimates.append({'Product': product, 'Avg Daily Sales (last 30d)': round(daily_rate,2) if not np.isinf(days_left) else 0, 'Days Until Out': round(days_left,1) if not np.isinf(days_left) else 'N/A', 'Est Runout Date': est_runout_date})
reorder_df = pd.DataFrame(reorder_estimates)
print('Reorder Estimates for High Demand Products (Assuming 500 units in stock):')
display(reorder_df)
Reorder Estimates for High Demand Products (Assuming 500 units in stock):
| Product | Avg Daily Sales (last 30d) | Days Until Out | Est Runout Date | |
|---|---|---|---|---|
| 0 | Laptop | 34.20 | 14.6 | 2023-04-24 14:52:37.894737 |
| 1 | Tablet | 25.57 | 19.6 | 2023-04-29 13:21:40.130378 |
| 2 | Keyboard | 21.83 | 22.9 | 2023-05-02 21:37:05.954198 |
InΒ [85]:
# Compare Average Sales per Product in Each Region
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
# Calculate average sales (units sold) per product in each region
avg_sales = df_salesone.groupby(['Region', 'Product'])['Units Sold'].mean().unstack()
print('Average Units Sold per Product in Each Region:')
display(avg_sales)
# Visualize as a heatmap
plt.figure(figsize=(10,6))
sns.heatmap(avg_sales, annot=True, fmt='.1f', cmap='YlGnBu')
plt.title('Average Units Sold per Product in Each Region')
plt.ylabel('Region')
plt.xlabel('Product')
plt.tight_layout()
plt.show()
Average Units Sold per Product in Each Region:
| Product | Keyboard | Laptop | Monitor | Phone | Tablet |
|---|---|---|---|---|---|
| Region | |||||
| East | 126.400000 | 105.333333 | 69.666667 | 78.20 | 119.000000 |
| North | 124.333333 | 96.800000 | 64.000000 | 172.50 | 91.500000 |
| South | 75.500000 | 141.500000 | 112.500000 | 134.75 | 143.500000 |
| West | 133.714286 | 84.285714 | 85.000000 | 91.00 | 112.166667 |
InΒ [86]:
# Revenue per Unit Sold Efficiency Matrix (Region vs Product)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
# Calculate revenue per unit sold for each product in each region
rev_per_unit = df_salesone.groupby(['Region', 'Product']).apply(lambda x: x['Total Revenue'].sum() / x['Units Sold'].sum() if x['Units Sold'].sum() > 0 else 0).unstack()
print('Revenue per Unit Sold Efficiency Matrix:')
display(rev_per_unit)
# Visualize as a heatmap
plt.figure(figsize=(10,6))
sns.heatmap(rev_per_unit, annot=True, fmt='.2f', cmap='OrRd')
plt.title('Revenue per Unit Sold Efficiency (Region vs Product)')
plt.ylabel('Region')
plt.xlabel('Product')
plt.tight_layout()
plt.show()
Revenue per Unit Sold Efficiency Matrix:
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\390871768.py:9: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. rev_per_unit = df_salesone.groupby(['Region', 'Product']).apply(lambda x: x['Total Revenue'].sum() / x['Units Sold'].sum() if x['Units Sold'].sum() > 0 else 0).unstack()
| Product | Keyboard | Laptop | Monitor | Phone | Tablet |
|---|---|---|---|---|---|
| Region | |||||
| East | 428.240506 | 555.700949 | 759.172249 | 765.508951 | 573.123649 |
| North | 457.412869 | 674.604339 | 500.625000 | 367.365217 | 651.030055 |
| South | 590.026490 | 516.839223 | 479.693333 | 694.740260 | 642.128920 |
| West | 577.211538 | 661.059322 | 282.958170 | 420.000000 | 857.123328 |
InΒ [87]:
# Highlight Periods When Revenue Dropped from Previous Day
import pandas as pd
import matplotlib.pyplot as plt
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
# Aggregate total revenue by day
daily_revenue = df_salesone.groupby('Date')['Total Revenue'].sum().sort_index().reset_index()
# Calculate revenue change from previous day
daily_revenue['Revenue Change'] = daily_revenue['Total Revenue'].diff()
daily_revenue['Drop'] = daily_revenue['Revenue Change'] < 0
# Highlight drops on the plot
plt.figure(figsize=(12,6))
plt.plot(daily_revenue['Date'], daily_revenue['Total Revenue'], marker='o', label='Total Revenue')
plt.scatter(daily_revenue.loc[daily_revenue['Drop'], 'Date'],
daily_revenue.loc[daily_revenue['Drop'], 'Total Revenue'],
color='red', label='Revenue Drop', zorder=5)
plt.title('Daily Total Revenue with Drops Highlighted')
plt.xlabel('Date')
plt.ylabel('Total Revenue')
plt.legend()
plt.tight_layout()
plt.show()
# Show table of drops
drops = daily_revenue[daily_revenue['Drop']][['Date', 'Total Revenue', 'Revenue Change']]
print('Days When Revenue Dropped from Previous Day:')
display(drops)
Days When Revenue Dropped from Previous Day:
| Date | Total Revenue | Revenue Change | |
|---|---|---|---|
| 1 | 2023-01-03 | 131407 | -201290.0 |
| 2 | 2023-01-04 | 111398 | -20009.0 |
| 3 | 2023-01-06 | 50710 | -60688.0 |
| 5 | 2023-01-08 | 7280 | -46185.0 |
| 7 | 2023-01-14 | 134735 | -44232.0 |
| 9 | 2023-01-18 | 27504 | -193955.0 |
| 11 | 2023-01-22 | 86161 | -147637.0 |
| 14 | 2023-02-02 | 94248 | -74802.0 |
| 16 | 2023-02-04 | 46020 | -97620.0 |
| 19 | 2023-02-08 | 52864 | -50757.0 |
| 20 | 2023-02-09 | 15142 | -37722.0 |
| 22 | 2023-02-13 | 127970 | -6064.0 |
| 24 | 2023-02-17 | 75650 | -77007.0 |
| 25 | 2023-02-18 | 55125 | -20525.0 |
| 26 | 2023-02-19 | 11674 | -43451.0 |
| 28 | 2023-02-21 | 57663 | -108830.0 |
| 31 | 2023-02-24 | 21600 | -133938.0 |
| 33 | 2023-02-28 | 98802 | -8606.0 |
| 35 | 2023-03-02 | 27432 | -226024.0 |
| 37 | 2023-03-04 | 33810 | -302633.0 |
| 39 | 2023-03-12 | 24827 | -168333.0 |
| 41 | 2023-03-14 | 30876 | -60909.0 |
| 42 | 2023-03-15 | 22776 | -8100.0 |
| 44 | 2023-03-17 | 20925 | -90158.0 |
| 45 | 2023-03-19 | 17181 | -3744.0 |
| 47 | 2023-03-22 | 42944 | -50840.0 |
| 48 | 2023-03-23 | 28731 | -14213.0 |
| 49 | 2023-03-24 | 27645 | -1086.0 |
| 50 | 2023-03-25 | 21360 | -6285.0 |
| 54 | 2023-03-30 | 57208 | -110323.0 |
| 58 | 2023-04-03 | 106215 | -74882.0 |
| 59 | 2023-04-05 | 26533 | -79682.0 |
InΒ [88]:
# Visualize Region Contribution in Units Sold Across Products
import pandas as pd
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
# Pivot table: products as x, regions as stacked bars
units_by_region_product = df_salesone.pivot_table(index='Product', columns='Region', values='Units Sold', aggfunc='sum', fill_value=0)
# Plot stacked bar chart
units_by_region_product.plot(kind='bar', stacked=True, figsize=(12,6), colormap='tab20')
plt.title('Region Contribution in Units Sold Across Products')
plt.ylabel('Units Sold')
plt.xlabel('Product')
plt.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
InΒ [89]:
# Analyze Top Product Combinations Contributing to Revenue by Region
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
# Load data
df_salesone = pd.read_csv('salesone.csv')
# For each region, find top 3 product pairs by total revenue
regions = df_salesone['Region'].unique()
combo_results = []
for region in regions:
reg_df = df_salesone[df_salesone['Region'] == region]
products = reg_df['Product'].unique()
# All possible product pairs
for prod1, prod2 in combinations(products, 2):
combo_revenue = reg_df[reg_df['Product'].isin([prod1, prod2])]['Total Revenue'].sum()
combo_results.append({'Region': region, 'Product Combo': f'{prod1} + {prod2}', 'Total Revenue': combo_revenue})
combo_df = pd.DataFrame(combo_results)
# For each region, get top 3 combos
top_combos = combo_df.sort_values(['Region', 'Total Revenue'], ascending=[True, False]).groupby('Region').head(3)
print('Top Product Combinations by Revenue for Each Region:')
display(top_combos)
# Visualize
plt.figure(figsize=(12,6))
sns.barplot(data=top_combos, x='Region', y='Total Revenue', hue='Product Combo')
plt.title('Top Product Combinations by Revenue for Each Region')
plt.ylabel('Total Revenue')
plt.xlabel('Region')
plt.legend(title='Product Combo', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Top Product Combinations by Revenue for Each Region:
| Region | Product Combo | Total Revenue | |
|---|---|---|---|
| 11 | East | Laptop + Tablet | 828615 |
| 17 | East | Tablet + Phone | 776726 |
| 18 | East | Tablet + Keyboard | 748060 |
| 27 | North | Laptop + Tablet | 891294 |
| 24 | North | Keyboard + Laptop | 823632 |
| 28 | North | Laptop + Phone | 779758 |
| 0 | South | Phone + Monitor | 806189 |
| 5 | South | Monitor + Tablet | 800306 |
| 2 | South | Phone + Tablet | 743047 |
| 31 | West | Keyboard + Tablet | 1117114 |
| 37 | West | Tablet + Laptop | 966869 |
| 32 | West | Keyboard + Laptop | 930295 |
InΒ [90]:
# Find Most Profitable Product in Each Region
import pandas as pd
df_salesone = pd.read_csv('salesone.csv')
# Group by Region and Product, sum Profit
profit_by_region_product = df_salesone.groupby(['Region', 'Product'])['Profit'].sum().reset_index()
# For each region, find the product with the highest profit
idx = profit_by_region_product.groupby('Region')['Profit'].idxmax()
most_profitable = profit_by_region_product.loc[idx].reset_index(drop=True)
print('Most Profitable Product in Each Region:')
display(most_profitable)
Most Profitable Product in Each Region:
| Region | Product | Profit | |
|---|---|---|---|
| 0 | East | Tablet | 28237 |
| 1 | North | Laptop | 25947 |
| 2 | South | Monitor | 27614 |
| 3 | West | Monitor | 48774 |
InΒ [91]:
# Find Which Day of the Week Yields the Highest Revenue
import pandas as pd
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
df_salesone['DayOfWeek'] = df_salesone['Date'].dt.day_name()
# Group by day of week and sum revenue
dow_revenue = df_salesone.groupby('DayOfWeek')['Total Revenue'].sum().reset_index()
# Order days of week
from pandas.api.types import CategoricalDtype
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_revenue['DayOfWeek'] = dow_revenue['DayOfWeek'].astype(CategoricalDtype(categories=days_order, ordered=True))
dow_revenue = dow_revenue.sort_values('DayOfWeek')
print('Total Revenue by Day of the Week:')
display(dow_revenue)
# Find the day with the highest revenue
max_day = dow_revenue.loc[dow_revenue['Total Revenue'].idxmax()]
print(f"\nDay with Highest Revenue: {max_day['DayOfWeek']} (Revenue: {max_day['Total Revenue']})")
Total Revenue by Day of the Week:
| DayOfWeek | Total Revenue | |
|---|---|---|
| 1 | Monday | 1499632 |
| 5 | Tuesday | 793525 |
| 6 | Wednesday | 800307 |
| 4 | Thursday | 642039 |
| 0 | Friday | 770045 |
| 2 | Saturday | 817439 |
| 3 | Sunday | 837245 |
Day with Highest Revenue: Monday (Revenue: 1499632)
InΒ [92]:
# Track How Revenue Grows Over Time
import pandas as pd
import matplotlib.pyplot as plt
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
# Aggregate daily revenue and calculate cumulative sum
daily_revenue = df_salesone.groupby('Date')['Total Revenue'].sum().sort_index().reset_index()
daily_revenue['Cumulative Revenue'] = daily_revenue['Total Revenue'].cumsum()
plt.figure(figsize=(12,6))
plt.plot(daily_revenue['Date'], daily_revenue['Cumulative Revenue'], marker='o', color='teal')
plt.title('Cumulative Revenue Growth Over Time')
plt.xlabel('Date')
plt.ylabel('Cumulative Revenue')
plt.tight_layout()
plt.show()
print('Cumulative Revenue Table:')
display(daily_revenue)
Cumulative Revenue Table:
| Date | Total Revenue | Cumulative Revenue | |
|---|---|---|---|
| 0 | 2023-01-02 | 332697 | 332697 |
| 1 | 2023-01-03 | 131407 | 464104 |
| 2 | 2023-01-04 | 111398 | 575502 |
| 3 | 2023-01-06 | 50710 | 626212 |
| 4 | 2023-01-07 | 53465 | 679677 |
| ... | ... | ... | ... |
| 56 | 2023-04-01 | 105092 | 5627340 |
| 57 | 2023-04-02 | 181097 | 5808437 |
| 58 | 2023-04-03 | 106215 | 5914652 |
| 59 | 2023-04-05 | 26533 | 5941185 |
| 60 | 2023-04-10 | 219047 | 6160232 |
61 rows Γ 3 columns
InΒ [94]:
# Filter and Display Only Rows Where Units Sold is in Top 10%
import pandas as pd
df_salesone = pd.read_csv('salesone.csv')
# Calculate the 90th percentile threshold
threshold = df_salesone['Units Sold'].quantile(0.9)
# Filter rows where units sold is in the top 10%
top10_df = df_salesone[df_salesone['Units Sold'] >= threshold]
print(f"Rows Where Units Sold is in the Top 10% (Threshold: {threshold}):")
display(top10_df[['Date', 'Product', 'Region', 'Units Sold']])
Rows Where Units Sold is in the Top 10% (Threshold: 173.20000000000002):
| Date | Product | Region | Units Sold | |
|---|---|---|---|---|
| 18 | 30-01-2023 | Tablet | North | 175 |
| 31 | 28-02-2023 | Tablet | East | 198 |
| 32 | 11-02-2023 | Monitor | East | 178 |
| 46 | 07-01-2023 | Monitor | West | 185 |
| 47 | 21-01-2023 | Keyboard | West | 196 |
| 55 | 09-01-2023 | Tablet | East | 191 |
| 71 | 04-01-2023 | Monitor | West | 181 |
| 82 | 03-02-2023 | Keyboard | East | 190 |
| 85 | 10-04-2023 | Tablet | South | 197 |
| 97 | 23-03-2023 | Phone | North | 183 |
InΒ [95]:
# Visualize Top 10% Units Sold Records by Product and Region
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
threshold = df_salesone['Units Sold'].quantile(0.9)
top10_df = df_salesone[df_salesone['Units Sold'] >= threshold]
plt.figure(figsize=(12,6))
sns.barplot(data=top10_df, x='Product', y='Units Sold', hue='Region')
plt.title('Top 10% Units Sold Records by Product and Region')
plt.ylabel('Units Sold')
plt.xlabel('Product')
plt.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
InΒ [96]:
# Show Shortfall from a Monthly Revenue Target
import pandas as pd
import matplotlib.pyplot as plt
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
df_salesone['Month'] = df_salesone['Date'].dt.to_period('M')
# Set monthly revenue target (customize as needed)
monthly_target = 200000 # Example target
# Calculate monthly revenue
monthly_revenue = df_salesone.groupby('Month')['Total Revenue'].sum().reset_index()
monthly_revenue['Shortfall'] = monthly_target - monthly_revenue['Total Revenue']
monthly_revenue['Shortfall'] = monthly_revenue['Shortfall'].clip(lower=0)
print(f'Monthly Revenue Target: {monthly_target}')
display(monthly_revenue)
# Visualize shortfall
plt.figure(figsize=(10,5))
plt.bar(monthly_revenue['Month'].astype(str), monthly_revenue['Shortfall'], color='crimson')
plt.title('Monthly Revenue Shortfall from Target')
plt.xlabel('Month')
plt.ylabel('Shortfall')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Monthly Revenue Target: 200000
| Month | Total Revenue | Shortfall | |
|---|---|---|---|
| 0 | 2023-01 | 1860657 | 0 |
| 1 | 2023-02 | 1772464 | 0 |
| 2 | 2023-03 | 1889127 | 0 |
| 3 | 2023-04 | 637984 | 0 |
InΒ [97]:
# Check How Each Product Deviates from Its Average Revenue
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df_salesone = pd.read_csv('salesone.csv')
# Calculate average revenue per product
avg_revenue = df_salesone.groupby('Product')['Total Revenue'].mean().rename('Avg Revenue').reset_index()
# Merge with original data
df_merged = df_salesone.merge(avg_revenue, on='Product')
df_merged['Revenue Deviation'] = df_merged['Total Revenue'] - df_merged['Avg Revenue']
print('Sample of Revenue Deviation per Product:')
display(df_merged[['Date', 'Product', 'Region', 'Total Revenue', 'Avg Revenue', 'Revenue Deviation']].head())
# Visualize deviation for each product
plt.figure(figsize=(12,6))
sns.boxplot(data=df_merged, x='Product', y='Revenue Deviation', showmeans=True)
plt.title('Revenue Deviation from Product Average')
plt.ylabel('Deviation from Average Revenue')
plt.xlabel('Product')
plt.tight_layout()
plt.show()
Sample of Revenue Deviation per Product:
| Date | Product | Region | Total Revenue | Avg Revenue | Revenue Deviation | |
|---|---|---|---|---|---|---|
| 0 | 21-02-2023 | Phone | South | 57663 | 69895.000000 | -12232.000000 |
| 1 | 03-04-2023 | Laptop | East | 43838 | 62473.185185 | -18635.185185 |
| 2 | 15-01-2023 | Monitor | South | 128778 | 39259.739130 | 89518.260870 |
| 3 | 13-03-2023 | Monitor | East | 13284 | 39259.739130 | -25975.739130 |
| 4 | 02-03-2023 | Monitor | North | 27432 | 39259.739130 | -11827.739130 |
InΒ [98]:
# Auto-Detect Missing or Inconsistent Data
import pandas as pd
df_salesone = pd.read_csv('salesone.csv')
# Check for missing values
display(df_salesone.isnull().sum().to_frame('Missing Values'))
# Check for negative or zero values in columns that should be positive
num_cols = ['Units Sold', 'Unit Price', 'Total Revenue', 'Profit']
for col in num_cols:
invalid = df_salesone[df_salesone[col] <= 0]
if not invalid.empty:
print(f'Rows with non-positive values in {col}:')
display(invalid)
# Check for duplicate rows
duplicates = df_salesone[df_salesone.duplicated()]
if not duplicates.empty:
print('Duplicate Rows Detected:')
display(duplicates)
else:
print('No duplicate rows detected.')
| Missing Values | |
|---|---|
| Date | 0 |
| Product | 0 |
| Region | 0 |
| Units Sold | 0 |
| Unit Price | 0 |
| Total Revenue | 0 |
| Profit | 0 |
| Discount | 0 |
No duplicate rows detected.
InΒ [99]:
# Flag Sudden Day-over-Day Revenue Drop > 30%
import pandas as pd
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
# Aggregate daily revenue
daily_revenue = df_salesone.groupby('Date')['Total Revenue'].sum().sort_index().reset_index()
# Calculate percent change from previous day
daily_revenue['Pct_Change'] = daily_revenue['Total Revenue'].pct_change()
# Flag days with >30% drop
daily_revenue['Flag_Drop_Over_30pct'] = daily_revenue['Pct_Change'] < -0.3
# Show flagged days
flagged = daily_revenue[daily_revenue['Flag_Drop_Over_30pct']][['Date', 'Total Revenue', 'Pct_Change']]
print('Days with Revenue Drop > 30% from Previous Day:')
display(flagged)
Days with Revenue Drop > 30% from Previous Day:
| Date | Total Revenue | Pct_Change | |
|---|---|---|---|
| 1 | 2023-01-03 | 131407 | -0.605025 |
| 3 | 2023-01-06 | 50710 | -0.544785 |
| 5 | 2023-01-08 | 7280 | -0.863836 |
| 9 | 2023-01-18 | 27504 | -0.875805 |
| 11 | 2023-01-22 | 86161 | -0.631472 |
| 14 | 2023-02-02 | 94248 | -0.442484 |
| 16 | 2023-02-04 | 46020 | -0.679616 |
| 19 | 2023-02-08 | 52864 | -0.489833 |
| 20 | 2023-02-09 | 15142 | -0.713567 |
| 24 | 2023-02-17 | 75650 | -0.504445 |
| 26 | 2023-02-19 | 11674 | -0.788227 |
| 28 | 2023-02-21 | 57663 | -0.653661 |
| 31 | 2023-02-24 | 21600 | -0.861127 |
| 35 | 2023-03-02 | 27432 | -0.891768 |
| 37 | 2023-03-04 | 33810 | -0.899507 |
| 39 | 2023-03-12 | 24827 | -0.871469 |
| 41 | 2023-03-14 | 30876 | -0.663605 |
| 44 | 2023-03-17 | 20925 | -0.811627 |
| 47 | 2023-03-22 | 42944 | -0.542097 |
| 48 | 2023-03-23 | 28731 | -0.330966 |
| 54 | 2023-03-30 | 57208 | -0.658523 |
| 58 | 2023-04-03 | 106215 | -0.413491 |
| 59 | 2023-04-05 | 26533 | -0.750195 |
InΒ [102]:
# Suggest Reasons for Unit Price * Units Sold Not Equal to Total Revenue
import pandas as pd
df_salesone = pd.read_csv('salesone.csv')
expected_revenue = df_salesone['Unit Price'] * df_salesone['Units Sold']
df_salesone['Revenue_Match'] = abs(df_salesone['Total Revenue'] - expected_revenue) < 1e-2
mismatches = df_salesone[~df_salesone['Revenue_Match']]
if mismatches.empty:
print('All rows: Unit Price * Units Sold equals Total Revenue (within rounding tolerance).')
else:
print('Rows where Unit Price * Units Sold does NOT equal Total Revenue:')
display(mismatches[['Date', 'Product', 'Region', 'Units Sold', 'Unit Price', 'Total Revenue']])
print('\nPossible reasons for mismatch:')
print('- Discounts or promotions applied to sales')
print('- Returns or refunds processed')
print('- Data entry errors or rounding issues')
print('- Bundled sales or package deals')
print('- Taxes, fees, or additional charges not reflected in unit price')
All rows: Unit Price * Units Sold equals Total Revenue (within rounding tolerance).
InΒ [103]:
# Heatmap of Product Demand by Region
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df_salesone = pd.read_csv('salesone.csv')
# Pivot table: products as columns, regions as rows, values as total units sold
demand_matrix = df_salesone.pivot_table(index='Region', columns='Product', values='Units Sold', aggfunc='sum', fill_value=0)
print('Product Demand (Units Sold) by Region:')
display(demand_matrix)
plt.figure(figsize=(10,6))
sns.heatmap(demand_matrix, annot=True, fmt='.0f', cmap='YlOrRd')
plt.title('Heatmap of Product Demand by Region')
plt.ylabel('Region')
plt.xlabel('Product')
plt.tight_layout()
plt.show()
Product Demand (Units Sold) by Region:
| Product | Keyboard | Laptop | Monitor | Phone | Tablet |
|---|---|---|---|---|---|
| Region | |||||
| East | 632 | 632 | 209 | 391 | 833 |
| North | 373 | 968 | 192 | 345 | 366 |
| South | 151 | 566 | 900 | 539 | 574 |
| West | 936 | 590 | 765 | 91 | 673 |
InΒ [104]:
# Heatmaps of Product Demand by Region: Aggregated by Month and by Revenue
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
df_salesone['Month'] = df_salesone['Date'].dt.strftime('%b')
# 1. Heatmap: Units Sold by Region and Month
units_month_region = df_salesone.pivot_table(index='Region', columns='Month', values='Units Sold', aggfunc='sum', fill_value=0)
plt.figure(figsize=(10,6))
sns.heatmap(units_month_region, annot=True, fmt='.0f', cmap='Blues')
plt.title('Units Sold by Region and Month')
plt.ylabel('Region')
plt.xlabel('Month')
plt.tight_layout()
plt.show()
# 2. Heatmap: Total Revenue by Region and Product
revenue_region_product = df_salesone.pivot_table(index='Region', columns='Product', values='Total Revenue', aggfunc='sum', fill_value=0)
plt.figure(figsize=(10,6))
sns.heatmap(revenue_region_product, annot=True, fmt='.0f', cmap='Greens')
plt.title('Total Revenue by Region and Product')
plt.ylabel('Region')
plt.xlabel('Product')
plt.tight_layout()
plt.show()
InΒ [105]:
# Compare Revenue vs Units Sold Over Time Using Dual Axis Plot
import pandas as pd
import matplotlib.pyplot as plt
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
# Aggregate daily revenue and units sold
daily = df_salesone.groupby('Date').agg({'Total Revenue': 'sum', 'Units Sold': 'sum'}).sort_index().reset_index()
fig, ax1 = plt.subplots(figsize=(12,6))
color = 'tab:blue'
ax1.set_xlabel('Date')
ax1.set_ylabel('Total Revenue', color=color)
ax1.plot(daily['Date'], daily['Total Revenue'], color=color, marker='o', label='Total Revenue')
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()
color = 'tab:orange'
ax2.set_ylabel('Units Sold', color=color)
ax2.plot(daily['Date'], daily['Units Sold'], color=color, marker='s', label='Units Sold')
ax2.tick_params(axis='y', labelcolor=color)
plt.title('Revenue vs Units Sold Over Time (Dual Axis)')
fig.tight_layout()
plt.show()
InΒ [106]:
# Forecast Revenue Using Polynomial Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Load data
df_salesone = pd.read_csv('salesone.csv', parse_dates=['Date'], dayfirst=True)
df_salesone['Month'] = df_salesone['Date'].dt.to_period('M')
monthly_revenue = df_salesone.groupby('Month')['Total Revenue'].sum().reset_index()
monthly_revenue['MonthNum'] = np.arange(len(monthly_revenue))
# Prepare data for polynomial regression
X = monthly_revenue[['MonthNum']]
y = monthly_revenue['Total Revenue']
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)
# Forecast next 6 months
future_months = np.arange(len(monthly_revenue), len(monthly_revenue)+6)
future_dates = pd.date_range(monthly_revenue['Month'].dt.to_timestamp().max()+pd.offsets.MonthBegin(1), periods=6, freq='MS')
future_X_poly = poly.transform(future_months.reshape(-1,1))
future_revenue = model.predict(future_X_poly)
# Plot actual and forecasted revenue
plt.figure(figsize=(10,5))
plt.plot(monthly_revenue['Month'].astype(str), monthly_revenue['Total Revenue'], marker='o', label='Actual Revenue')
plt.plot(future_dates.strftime('%Y-%m'), future_revenue, marker='x', linestyle='--', color='red', label='Forecasted Revenue (Poly)')
plt.title('Monthly Revenue Forecast (Polynomial Regression)')
plt.xlabel('Month')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
# Print forecasted revenue values
forecast_df = pd.DataFrame({'Month': future_dates.strftime('%Y-%m'), 'Forecasted Revenue': future_revenue})
print(forecast_df)
C:\Users\2283297\AppData\Roaming\Python\Python313\site-packages\sklearn\utils\validation.py:2749: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names warnings.warn(
Month Forecasted Revenue 0 2023-05 -801468.5 1 2023-06 -2901029.1 2 2023-07 -5582064.7 3 2023-08 -8844575.3 4 2023-09 -12688560.9 5 2023-10 -17114021.5
InΒ [107]:
# Estimate Revenue Lost if Units Sold Was Capped Due to Inventory Shortage
import pandas as pd
# Load data
df_salesone = pd.read_csv('salesone.csv')
# Set hypothetical inventory cap per sale (customize as needed)
unit_cap = 100
# Calculate capped units sold and capped revenue
capped_units = df_salesone['Units Sold'].clip(upper=unit_cap)
df_salesone['Capped Revenue'] = capped_units * df_salesone['Unit Price']
# Calculate revenue lost due to cap
revenue_lost = df_salesone['Total Revenue'] - df_salesone['Capped Revenue']
df_salesone['Revenue Lost'] = revenue_lost.clip(lower=0)
total_lost = df_salesone['Revenue Lost'].sum()
print(f"Total Revenue Lost Due to Inventory Cap of {unit_cap} Units per Sale: {total_lost:.2f}")
display(df_salesone[df_salesone['Revenue Lost'] > 0][['Date', 'Product', 'Region', 'Units Sold', 'Unit Price', 'Total Revenue', 'Capped Revenue', 'Revenue Lost']])
Total Revenue Lost Due to Inventory Cap of 100 Units per Sale: 1540695.00
| Date | Product | Region | Units Sold | Unit Price | Total Revenue | Capped Revenue | Revenue Lost | |
|---|---|---|---|---|---|---|---|---|
| 0 | 21-02-2023 | Phone | South | 149 | 387 | 57663 | 38700 | 18963 |
| 2 | 15-01-2023 | Monitor | South | 169 | 762 | 128778 | 76200 | 52578 |
| 4 | 02-03-2023 | Monitor | North | 108 | 254 | 27432 | 25400 | 2032 |
| 5 | 21-01-2023 | Keyboard | North | 156 | 589 | 91884 | 58900 | 32984 |
| 7 | 28-03-2023 | Keyboard | West | 140 | 884 | 123760 | 88400 | 35360 |
| 8 | 16-03-2023 | Keyboard | North | 157 | 203 | 31871 | 20300 | 11571 |
| 9 | 16-03-2023 | Laptop | East | 161 | 492 | 79212 | 49200 | 30012 |
| 11 | 10-04-2023 | Laptop | South | 129 | 345 | 44505 | 34500 | 10005 |
| 12 | 24-01-2023 | Laptop | South | 170 | 275 | 46750 | 27500 | 19250 |
| 13 | 03-01-2023 | Monitor | West | 161 | 138 | 22218 | 13800 | 8418 |
| 14 | 22-01-2023 | Tablet | South | 125 | 576 | 72000 | 57600 | 14400 |
| 16 | 02-01-2023 | Laptop | North | 122 | 858 | 104676 | 85800 | 18876 |
| 17 | 29-03-2023 | Tablet | West | 173 | 637 | 110201 | 63700 | 46501 |
| 18 | 30-01-2023 | Tablet | North | 175 | 966 | 169050 | 96600 | 72450 |
| 19 | 07-02-2023 | Laptop | North | 113 | 917 | 103621 | 91700 | 11921 |
| 21 | 05-03-2023 | Keyboard | South | 121 | 624 | 75504 | 62400 | 13104 |
| 22 | 01-03-2023 | Phone | South | 108 | 927 | 100116 | 92700 | 7416 |
| 23 | 21-01-2023 | Phone | North | 162 | 605 | 98010 | 60500 | 37510 |
| 24 | 02-02-2023 | Laptop | West | 102 | 924 | 94248 | 92400 | 1848 |
| 25 | 17-03-2023 | Monitor | South | 155 | 135 | 20925 | 13500 | 7425 |
| 26 | 27-02-2023 | Laptop | West | 137 | 784 | 107408 | 78400 | 29008 |
| 27 | 22-01-2023 | Monitor | South | 119 | 119 | 14161 | 11900 | 2261 |
| 30 | 01-04-2023 | Keyboard | West | 172 | 611 | 105092 | 61100 | 43992 |
| 31 | 28-02-2023 | Tablet | East | 198 | 499 | 98802 | 49900 | 48902 |
| 32 | 11-02-2023 | Monitor | East | 178 | 753 | 134034 | 75300 | 58734 |
| 33 | 02-04-2023 | Tablet | West | 170 | 982 | 166940 | 98200 | 68740 |
| 36 | 15-01-2023 | Tablet | East | 151 | 191 | 28841 | 19100 | 9741 |
| 39 | 16-02-2023 | Laptop | North | 157 | 899 | 141143 | 89900 | 51243 |
| 40 | 03-03-2023 | Keyboard | West | 137 | 826 | 113162 | 82600 | 30562 |
| 41 | 20-02-2023 | Phone | South | 145 | 953 | 138185 | 95300 | 42885 |
| 42 | 24-02-2023 | Tablet | South | 144 | 150 | 21600 | 15000 | 6600 |
| 43 | 05-03-2023 | Laptop | South | 154 | 764 | 117656 | 76400 | 41256 |
| 44 | 03-01-2023 | Phone | East | 137 | 797 | 109189 | 79700 | 29489 |
| 46 | 07-01-2023 | Monitor | West | 185 | 289 | 53465 | 28900 | 24565 |
| 47 | 21-01-2023 | Keyboard | West | 196 | 224 | 43904 | 22400 | 21504 |
| 48 | 14-03-2023 | Tablet | North | 124 | 249 | 30876 | 24900 | 5976 |
| 49 | 08-02-2023 | Laptop | North | 128 | 413 | 52864 | 41300 | 11564 |
| 51 | 04-01-2023 | Keyboard | East | 167 | 441 | 73647 | 44100 | 29547 |
| 53 | 01-03-2023 | Keyboard | West | 118 | 791 | 93338 | 79100 | 14238 |
| 55 | 09-01-2023 | Tablet | East | 191 | 937 | 178967 | 93700 | 85267 |
| 61 | 01-03-2023 | Monitor | South | 106 | 152 | 16112 | 15200 | 912 |
| 63 | 13-02-2023 | Phone | East | 120 | 719 | 86280 | 71900 | 14380 |
| 66 | 04-02-2023 | Keyboard | West | 156 | 295 | 46020 | 29500 | 16520 |
| 71 | 04-01-2023 | Monitor | West | 181 | 146 | 26426 | 14600 | 11826 |
| 72 | 02-01-2023 | Tablet | West | 167 | 966 | 161322 | 96600 | 64722 |
| 74 | 23-02-2023 | Monitor | South | 126 | 919 | 115794 | 91900 | 23894 |
| 76 | 23-02-2023 | Laptop | North | 108 | 368 | 39744 | 36800 | 2944 |
| 77 | 03-04-2023 | Laptop | East | 133 | 469 | 62377 | 46900 | 15477 |
| 80 | 31-03-2023 | Laptop | North | 102 | 769 | 78438 | 76900 | 1538 |
| 82 | 03-02-2023 | Keyboard | East | 190 | 756 | 143640 | 75600 | 68040 |
| 83 | 15-03-2023 | Laptop | East | 104 | 219 | 22776 | 21900 | 876 |
| 84 | 03-03-2023 | Tablet | South | 108 | 930 | 100440 | 93000 | 7440 |
| 85 | 10-04-2023 | Tablet | South | 197 | 886 | 174542 | 88600 | 85942 |
| 86 | 14-01-2023 | Laptop | East | 125 | 703 | 87875 | 70300 | 17575 |
| 87 | 05-04-2023 | Keyboard | East | 169 | 157 | 26533 | 15700 | 10833 |
| 88 | 17-02-2023 | Laptop | West | 170 | 445 | 75650 | 44500 | 31150 |
| 90 | 13-03-2023 | Phone | South | 137 | 573 | 78501 | 57300 | 21201 |
| 96 | 21-03-2023 | Laptop | South | 113 | 740 | 83620 | 74000 | 9620 |
| 97 | 23-03-2023 | Phone | North | 183 | 157 | 28731 | 15700 | 13031 |
| 99 | 24-01-2023 | Monitor | South | 123 | 612 | 75276 | 61200 | 14076 |
InΒ [108]:
# Suggest Optimal Pricing by Analyzing Price Points Driving Highest Revenue
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load data
df_salesone = pd.read_csv('salesone.csv')
# Bin unit prices for each product (bin width can be adjusted)
bin_width = 50
products = df_salesone['Product'].unique()
optimal_prices = []
for product in products:
prod_df = df_salesone[df_salesone['Product'] == product]
# Create price bins
min_price, max_price = prod_df['Unit Price'].min(), prod_df['Unit Price'].max()
bins = range(int(min_price), int(max_price) + bin_width, bin_width)
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
# Aggregate revenue by price bin
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
# Find price bin with highest revenue
top_bin = bin_revenue.loc[bin_revenue['Total Revenue'].idxmax()]
optimal_prices.append({'Product': product, 'Optimal Price Range': str(top_bin['Price Bin']), 'Revenue': top_bin['Total Revenue']})
# Plot for each product
plt.figure(figsize=(8,4))
sns.barplot(data=bin_revenue, x='Price Bin', y='Total Revenue', color='skyblue')
plt.title(f'Revenue by Unit Price Bin for {product}')
plt.ylabel('Total Revenue')
plt.xlabel('Unit Price Bin')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Show optimal price ranges
df_optimal = pd.DataFrame(optimal_prices)
print('Suggested Optimal Price Ranges by Product:')
display(df_optimal)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
prod_df['Price Bin'] = pd.cut(prod_df['Unit Price'], bins=bins, include_lowest=True)
C:\Users\2283297\AppData\Local\Temp\ipykernel_8004\742797135.py:21: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
bin_revenue = prod_df.groupby('Price Bin')['Total Revenue'].sum().reset_index()
Suggested Optimal Price Ranges by Product:
| Product | Optimal Price Range | Revenue | |
|---|---|---|---|
| 0 | Phone | (907.0, 957.0] | 289011 |
| 1 | Laptop | (852.0, 902.0] | 357014 |
| 2 | Monitor | (719.0, 769.0] | 341975 |
| 3 | Keyboard | (754.0, 804.0] | 283838 |
| 4 | Tablet | (950.0, 1000.0] | 497312 |
Sales Strategy Insights & RecommendationsΒΆ
- Optimize Pricing: Focus on price ranges that historically drive the highest revenue for each product. Adjust pricing strategies to maximize revenue in these optimal bins.
- Monitor Inventory for High Demand: Products with frequent top 10% sales or those at risk of inventory shortfall should have proactive reorder strategies to avoid lost sales.
- Targeted Promotions: Use monthly and regional demand heatmaps to time and localize promotions, especially in months or regions with lower sales or higher shortfall.
- Product Bundling: Leverage top product combinations by region to create bundled offers that maximize revenue and cross-sell opportunities.
- Address Revenue Drops: Investigate and address days or periods with significant revenue drops (>30%) to identify operational or market issues.
- Focus on Most Profitable Products: Allocate marketing and inventory resources to the most profitable products in each region.
- Improve Data Quality: Regularly audit for missing, inconsistent, or anomalous data to ensure reliable decision-making.
- Balance Revenue and Units Sold: Use dual-axis and efficiency analyses to balance strategies between maximizing revenue and increasing sales volume.
- Adjust for Seasonality: Plan for inventory and promotions around months or days of the week with peak demand or revenue.
- Elasticity-Aware Pricing: Consider demand elasticity when adjusting prices to avoid revenue loss from reduced sales volume.
These insights are based on the analyses and visualizations in this notebook. For best results, regularly update your data and revisit these strategies as market conditions change.